package org.whuims.core.dataprocessor;

import org.apache.commons.io.FileUtils;

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Finds term entries in a block of text with a regular expression and prints, for every
 * entry, the whole match, the Chinese term, the English term and the explanation to
 * standard output.
 *
 * <p>An entry is recognised as: a run of whitespace ending in a line break, a run of CJK
 * ideographs (U+4E00 to U+9FFF), whitespace, a run of ASCII word characters, at least one
 * further character up to the next line break, and then an explanation made of ideographs,
 * word characters and whitespace that ends with the ideographic full stop.
 *
 * <p>Created by cheng on 2016/11/11.
 */
public class TermPatternMatcher {
    /** File read by {@link #main(String[])} when no path argument is supplied. */
    private static final String DEFAULT_INPUT_PATH =
            "D:\\work\\ideaproject\\stdmem\\src\\main\\webapp\\resource\\txt\\CN-DL\\DL_T 5390-2007_488.txt";

    /**
     * Regular expression for one term entry. Capturing groups: 1 = Chinese term,
     * 2 = English term, 3 = explanation. Line breaks in the expression are CR LF only;
     * the constructor normalises the input so that this is sufficient.
     */
    private static final String TERM_PATTERN_STRING =
            "[\\s]+\r\n"                        // whitespace followed by a line break
            + "\\s?([\\u4E00-\\u9FFF\\s]+)"     // group 1: ideographs (and whitespace)
            + "\\s+([\\w\\s]+)"                 // group 2: word characters (and whitespace)
            + ".+?\\r\\n"                       // rest of the line, at least one character
            + "([\\u4E00-\\u9FFF\\s\\w]+)。";   // group 3: explanation, closed by the full stop

    /** Compiled once; the expression has no {@code ^}/{@code $} anchors, so no flags are needed. */
    private static final Pattern TERM_PATTERN = Pattern.compile(TERM_PATTERN_STRING);

    /** A line feed that is not already part of a CR LF pair. */
    private static final Pattern BARE_LINE_FEED = Pattern.compile("(?<!\r)\n");

    /** One or more whitespace characters; removed from every printed value. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Text to search, with every bare LF already converted to CR LF. */
    private final String text;

    /**
     * Creates a matcher over the given text.
     *
     * <p>Bare line feeds are converted to CR LF so that text with Unix line endings is
     * matched the same way as text with Windows line endings. Text that already uses
     * CR LF throughout is left unchanged.
     *
     * @param text the text to search; {@code null} is treated as empty text
     */
    public TermPatternMatcher(String text) {
        this.text = text == null ? "" : BARE_LINE_FEED.matcher(text).replaceAll("\r\n");
    }

    /**
     * Reads a text file and prints every term entry found in it.
     *
     * @param args optional; {@code args[0]} is the path of the file to read (defaults to
     *             the built-in path), {@code args[1]} is the name of the charset used to
     *             decode it (defaults to the platform default charset)
     */
    public static void main(String[] args) {
        String path = args != null && args.length > 0 ? args[0] : DEFAULT_INPUT_PATH;
        Charset charset = args != null && args.length > 1
                ? Charset.forName(args[1])
                : Charset.defaultCharset();
        try {
            String text = new String(Files.readAllBytes(new File(path).toPath()), charset);
            new TermPatternMatcher(text).extract();
        } catch (IOException e) {
            System.err.println("Failed to read " + path + ": " + e);
        }
    }

    /**
     * Prints every term entry found in the text. Kept under its original (misspelled)
     * name for existing callers; delegates to {@link #extract()}.
     */
    public void extact() {
        extract();
    }

    /**
     * Prints every term entry found in the text to standard output.
     *
     * <p>Each entry produces four tab-separated label/value lines (whole match, Chinese
     * term, English term, explanation) followed by an empty line. All whitespace is
     * removed from the printed values.
     */
    public void extract() {
        Matcher matcher = TERM_PATTERN.matcher(text);
        while (matcher.find()) {
            System.out.println("全部\t" + stripWhitespace(matcher.group(0)));
            System.out.println("中文\t" + stripWhitespace(matcher.group(1)));
            System.out.println("英文\t" + stripWhitespace(matcher.group(2)));
            System.out.println("解释\t" + stripWhitespace(matcher.group(3)));
            System.out.println();
        }
    }

    /** Returns {@code value} with every whitespace character removed. */
    private static String stripWhitespace(String value) {
        return WHITESPACE.matcher(value).replaceAll("");
    }
}
